import pandas as pd
import numpy as np
from copy import deepcopy
from clustergrammer2 import net
# Registry of the DataFrames produced by each stage of this script,
# keyed by a short stage name (e.g. 'ini', 'clean', 'cat', 'proc').
df = {}
import clustergrammer_groupby as cby
# load json to dict
def load_to_dict(filename):
    """Read a JSON file and return the decoded object.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` decodes from the file (a dict for the
        address files used in this script).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    import json

    # The context manager closes the handle even when decoding fails.
    with open(filename, 'r') as f:
        return json.load(f)
# save dict to json
def save_to_json(inst_dict, filename, indent=True):
    """Serialize ``inst_dict`` to ``filename`` as JSON.

    Args:
        inst_dict: JSON-serializable object to write.
        filename: Destination path; an existing file is overwritten.
        indent: When truthy, pretty-print with a 2-space indent;
            otherwise write the compact single-line form.

    Raises:
        OSError: If the file cannot be opened for writing.
        TypeError: If ``inst_dict`` contains values JSON cannot encode.
    """
    import json

    # The context manager closes (and flushes) the handle even when
    # serialization raises part-way through.
    with open(filename, 'w') as fw:
        json.dump(inst_dict, fw, indent=2 if indent else None)
# Two partial address lookups that together cover the data set.
ad1 = load_to_dict('../big_data/address_1k.json')
ad2 = load_to_dict('../big_data/address_2k.json')

# Merge them into one lookup; when a key appears in both files the
# entry from ad2 replaces the one from ad1.
address_dict = {}
for partial_lookup in (ad1, ad2):
    address_dict.update(partial_lookup)

# Persist the combined lookup alongside its sources.
save_to_json(address_dict, '../big_data/address_dict.json')

# Number of entries in the combined lookup (notebook display expression).
len(address_dict)
# Load the raw challenge table.
df['ini'] = pd.read_csv('../challenge_data/dvs_challenge_1_membership_time_space.csv')

# Per-row metadata, each keyed by the stringified row index of df['ini'].
country_dict = {}
city_dict = {}
lat_dict = {}
lng_dict = {}

for inst_row in df['ini'].index.tolist():
    # Fetch the row once and read both coordinates from it, rather than
    # performing a separate .loc lookup for each coordinate.
    row_data = df['ini'].loc[inst_row]

    # address_dict comes from JSON, so its keys are strings.
    inst_row = str(inst_row)
    lat_dict[inst_row] = row_data['lat']
    lng_dict[inst_row] = row_data['long']

    if inst_row in address_dict:
        # The address is a ', '-separated string: the last component is
        # taken as the country and the fourth from the end as the city.
        inst_address = address_dict[inst_row].split(', ')
        inst_country = inst_address[-1]
        try:
            inst_city = inst_address[-4]
        except IndexError:
            # Address has fewer than four components.
            inst_city = 'N.A.'
    else:
        # No address is known for this row.
        inst_country = 'N.A.'
        inst_city = 'N.A.'

    country_dict[inst_row] = inst_country
    city_dict[inst_row] = inst_city

df['ini'].head()
# Columns removed from the working table; the coordinates were already
# captured in lat_dict / lng_dict.
dropped_cols = ['lat', 'long', 'date_with_hour', 'date']

# Drop them from an independent copy so df['ini'] is left untouched.
df['clean'] = deepcopy(df['ini']).drop(dropped_cols, axis=1)

# Notebook display expressions: dimensions and a preview.
df['clean'].shape
df['clean'].head()
# Transpose so that each original row of df['clean'] becomes a column.
df['cat'] = deepcopy(df['clean'].transpose())

# Replace each column label with a tuple: a 'P-<row index>' name followed
# by the 'Country', 'City', 'Lat' and 'Long' category strings for that row.
cols = df['cat'].columns.tolist()
new_cols = [
    (
        'P-' + str(x),
        'Country: ' + country_dict[str(x)],
        'City: ' + city_dict[str(x)],
        'Lat: ' + str(lat_dict[str(x)]),
        'Long: ' + str(lng_dict[str(x)]),
    )
    for x in cols
]
df['cat'].columns = new_cols

# Rebuild the frame from a float matrix. DataFrame.get_values() was
# deprecated in pandas 0.25 and removed in 1.0; .values returns the same
# ndarray and is available in every pandas release.
rows = df['cat'].index.tolist()
mat = df['cat'].values.astype('float')
df['proc'] = pd.DataFrame(columns=new_cols, index=rows, data=mat)

# Columns whose country is known (the 'N.A.' placeholder is absent).
cols = df['proc'].columns.tolist()
keep_cols = [x for x in cols if 'N.A.' not in x[1]]
print(len(cols), len(keep_cols))

# NOTE(review): the filtered frame is stored under 'prot', while the code
# that follows keeps using the unfiltered df['proc'] -- confirm whether
# 'prot' is intentional or a typo for 'proc'.
df['prot'] = df['proc'][keep_cols]
df['proc'].head()
# Colors for selected values of column category 1 (the 'Country: ...' labels).
country_colors = (
    ('Country: USA', 'blue'),
    ('Country: United Kingdom', 'white'),
    ('Country: Canada', 'red'),
    ('Country: India', 'green'),
    ('Country: Australia', 'black'),
)
for country_label, country_color in country_colors:
    net.set_cat_color(
        axis='col',
        cat_index=1,
        cat_name=country_label,
        inst_color=country_color,
    )

# Visualize the full matrix, row-wise z-scored with NaNs replaced by zero.
net.load_df(df['proc'])
net.swap_nan_for_zero()
net.normalize(axis='row', norm_type='zscore')
net.widget()

# Compute signatures grouped on the 'Country' category and visualize them.
signature_outputs = cby.generate_signatures(df['proc'], category_level='Country')
df_sig, keep_genes_dict, df_gene_pval, all_fold_info = signature_outputs
net.load_df(df_sig)
net.normalize(axis='row', norm_type='zscore')
net.widget()
# Category colors as (cat_index, cat_name, color). In the column tuples
# 'Country: ...' is category 1 and 'City: ...' is category 2, so every city
# color must be registered with cat_index=2. Two of them were previously
# registered under the country index (1).
category_colors = (
    (2, 'City: New York City', 'blue'),
    (2, 'City: San Francisco and County', 'white'),
    (2, 'City: Washington', 'red'),
    (1, 'Country: India', 'green'),
)
for color_cat_index, color_cat_name, color_value in category_colors:
    net.set_cat_color(
        axis='col',
        cat_index=color_cat_index,
        cat_name=color_cat_name,
        inst_color=color_value,
    )

# Restrict the matrix to the USA columns and keep a copy of that subset.
net.load_df(df['proc'])
net.filter_cat(axis='col', cat_index=1, cat_name='Country: USA')
df['usa'] = net.export_df()
net.normalize(axis='row', norm_type='zscore')
net.widget()

# Compute signatures grouped on the 'City' category within the USA subset
# and visualize them.
df_sig, keep_genes_dict, df_gene_pval, all_fold_info = cby.generate_signatures(
    df['usa'], category_level='City'
)
df_sig.shape
net.load_df(df_sig)
net.normalize(axis='row', norm_type='zscore')
net.widget()